knitr::opts_chunk$set(fig.align="center")
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr)
library(ggplot2)
library(magrittr)
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)
theme_set(theme_light())
source('helper_functions.R')
In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).
Given a search algorithm (bfs or dfs), an oracle (CompassQL or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle has any meaningful impact on a user’s rating for these metrics.
# Names of the rating columns analysed in this document, grouped by kind.
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")
# All rated metrics: confidence metrics first, then preference metrics.
# Built from the two groups so the lists cannot drift apart.
analyses <- c(confidence_metrics, preference_metrics)
# Load the per-task questionnaire responses.
user_response_data <- read.csv("split_by_participant_groups/ptask_responses.csv")

# Relabel the oracle and search conditions for display. The replacement text
# also ends up in the model coefficient names (e.g. `oracleDziban`,
# `searchDFS`). `fixed = TRUE` treats the patterns as literal strings.
user_response_data$oracle <- gsub("compassql", "CompassQL", user_response_data$oracle, fixed = TRUE)
user_response_data$oracle <- gsub("dziban", "Dziban", user_response_data$oracle, fixed = TRUE)
user_response_data$search <- gsub("bfs", "BFS", user_response_data$search, fixed = TRUE)
user_response_data$search <- gsub("dfs", "DFS", user_response_data$search, fixed = TRUE)

# Ratings are modelled as ordinal outcomes, so store every metric column
# (named in `analyses`, defined earlier in the script) as an ordered factor.
user_response_data[analyses] <- lapply(user_response_data[analyses], ordered)

# The experimental conditions are categorical predictors.
user_response_data <- user_response_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task)
  )
# Seed passed to model fitting and posterior prediction below.
seed <- 12

# Containers for the fitted models and the per-metric comparison results;
# entries are added under the metric's name (e.g. `models$confidence_udata`).
models <- list()
search_differences <- list()
oracle_differences <- list()
alg_differences <- list()
participant_group_differences <- list()
# Cumulative-probit (ordinal) mixed model of the confidence-in-understanding-
# data rating: full oracle x search x dataset x task interaction, a
# participant-group effect, and a random intercept per participant.
models$confidence_udata <- brm(
  formula = bf(
    confidence.udata ~ oracle * search * dataset * task +
      participant_group + (1 | participant_id)
  ),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  data = user_response_data,
  # Sampler settings: 2 chains x (2500 - 1000) post-warmup draws each.
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  # Path used by brm() to store the fitted model (see brm()'s `file` argument).
  file = "models/confidence_udata"
)
## Compiling Stan program...
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/Rcpp/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/unsupported" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/src/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppParallel/include/" -I"/Library/Frameworks/R.framework/Versions/4.0/Resources/library/rstan/include" -DEIGEN_NO_DEBUG -DBOOST_DISABLE_ASSERTS -DBOOST_PENDING_INTEGER_LOG2_HPP -DSTAN_THREADS -DBOOST_NO_AUTO_PTR -include '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp' -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1 -I/usr/local/include -fPIC -Wall -g -O2 -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:88:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:1: error: unknown type name 'namespace'
## namespace Eigen {
## ^
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:613:16: error: expected ';' after top level declarator
## namespace Eigen {
## ^
## ;
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/StanHeaders/include/stan/math/prim/mat/fun/Eigen.hpp:13:
## In file included from /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Dense:1:
## /Library/Frameworks/R.framework/Versions/4.0/Resources/library/RcppEigen/include/Eigen/Core:96:10: fatal error: 'complex' file not found
## #include <complex>
## ^~~~~~~~~
## 3 errors generated.
## make: *** [foo.o] Error 1
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
# Print estimates, 95% intervals and the Rhat / Bulk_ESS / Tail_ESS diagnostics.
summary(models$confidence_udata)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.udata ~ oracle * search * dataset * task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.22 0.19 0.87 1.63 1.00 554 946
##
## Population-Level Effects:
## Estimate Est.Error
## Intercept[1] -2.03 0.66
## Intercept[2] -0.77 0.65
## Intercept[3] 1.67 0.66
## oracleDziban 0.16 0.87
## searchDFS -0.65 0.84
## datasetmovies -0.09 0.89
## task2.RetrieveValue 0.17 0.57
## task3.Prediction 0.46 0.59
## task4.Exploration 0.74 0.60
## participant_groupstudent 0.47 0.35
## oracleDziban:searchDFS -0.10 1.22
## oracleDziban:datasetmovies 0.78 1.22
## searchDFS:datasetmovies 0.07 1.21
## oracleDziban:task2.RetrieveValue -0.19 0.83
## oracleDziban:task3.Prediction -0.70 0.84
## oracleDziban:task4.Exploration 0.58 0.88
## searchDFS:task2.RetrieveValue 0.31 0.81
## searchDFS:task3.Prediction -0.09 0.84
## searchDFS:task4.Exploration -0.68 0.85
## datasetmovies:task2.RetrieveValue 0.21 0.86
## datasetmovies:task3.Prediction -0.67 0.86
## datasetmovies:task4.Exploration -0.14 0.87
## oracleDziban:searchDFS:datasetmovies 0.21 1.67
## oracleDziban:searchDFS:task2.RetrieveValue 0.49 1.18
## oracleDziban:searchDFS:task3.Prediction 1.76 1.21
## oracleDziban:searchDFS:task4.Exploration 0.46 1.23
## oracleDziban:datasetmovies:task2.RetrieveValue -1.02 1.20
## oracleDziban:datasetmovies:task3.Prediction -0.15 1.21
## oracleDziban:datasetmovies:task4.Exploration -1.80 1.26
## searchDFS:datasetmovies:task2.RetrieveValue 0.19 1.16
## searchDFS:datasetmovies:task3.Prediction 0.63 1.17
## searchDFS:datasetmovies:task4.Exploration 1.18 1.20
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 0.71 1.66
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -0.80 1.72
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 0.92 1.75
## l-95% CI u-95% CI Rhat
## Intercept[1] -3.36 -0.77 1.01
## Intercept[2] -2.06 0.49 1.01
## Intercept[3] 0.45 3.02 1.01
## oracleDziban -1.45 1.89 1.00
## searchDFS -2.31 1.01 1.01
## datasetmovies -1.81 1.70 1.00
## task2.RetrieveValue -0.93 1.29 1.01
## task3.Prediction -0.67 1.56 1.00
## task4.Exploration -0.41 1.87 1.01
## participant_groupstudent -0.20 1.17 1.01
## oracleDziban:searchDFS -2.53 2.26 1.01
## oracleDziban:datasetmovies -1.61 3.19 1.01
## searchDFS:datasetmovies -2.33 2.42 1.01
## oracleDziban:task2.RetrieveValue -1.79 1.48 1.00
## oracleDziban:task3.Prediction -2.31 0.92 1.00
## oracleDziban:task4.Exploration -1.10 2.36 1.01
## searchDFS:task2.RetrieveValue -1.28 1.90 1.01
## searchDFS:task3.Prediction -1.75 1.52 1.00
## searchDFS:task4.Exploration -2.32 0.98 1.01
## datasetmovies:task2.RetrieveValue -1.49 1.80 1.00
## datasetmovies:task3.Prediction -2.36 0.96 1.01
## datasetmovies:task4.Exploration -1.96 1.51 1.01
## oracleDziban:searchDFS:datasetmovies -3.00 3.40 1.01
## oracleDziban:searchDFS:task2.RetrieveValue -1.80 2.78 1.01
## oracleDziban:searchDFS:task3.Prediction -0.63 4.11 1.00
## oracleDziban:searchDFS:task4.Exploration -1.91 2.91 1.01
## oracleDziban:datasetmovies:task2.RetrieveValue -3.38 1.34 1.00
## oracleDziban:datasetmovies:task3.Prediction -2.47 2.26 1.00
## oracleDziban:datasetmovies:task4.Exploration -4.33 0.74 1.01
## searchDFS:datasetmovies:task2.RetrieveValue -2.02 2.45 1.01
## searchDFS:datasetmovies:task3.Prediction -1.64 2.90 1.01
## searchDFS:datasetmovies:task4.Exploration -1.21 3.53 1.01
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue -2.50 3.92 1.01
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -4.13 2.64 1.01
## oracleDziban:searchDFS:datasetmovies:task4.Exploration -2.45 4.35 1.01
## Bulk_ESS Tail_ESS
## Intercept[1] 324 788
## Intercept[2] 308 780
## Intercept[3] 317 718
## oracleDziban 339 591
## searchDFS 336 731
## datasetmovies 348 860
## task2.RetrieveValue 338 725
## task3.Prediction 341 786
## task4.Exploration 372 965
## participant_groupstudent 473 1238
## oracleDziban:searchDFS 361 619
## oracleDziban:datasetmovies 330 728
## searchDFS:datasetmovies 358 893
## oracleDziban:task2.RetrieveValue 314 648
## oracleDziban:task3.Prediction 293 769
## oracleDziban:task4.Exploration 363 885
## searchDFS:task2.RetrieveValue 363 903
## searchDFS:task3.Prediction 316 688
## searchDFS:task4.Exploration 437 873
## datasetmovies:task2.RetrieveValue 404 1073
## datasetmovies:task3.Prediction 373 1180
## datasetmovies:task4.Exploration 453 1083
## oracleDziban:searchDFS:datasetmovies 336 731
## oracleDziban:searchDFS:task2.RetrieveValue 341 992
## oracleDziban:searchDFS:task3.Prediction 268 799
## oracleDziban:searchDFS:task4.Exploration 371 773
## oracleDziban:datasetmovies:task2.RetrieveValue 350 837
## oracleDziban:datasetmovies:task3.Prediction 355 1146
## oracleDziban:datasetmovies:task4.Exploration 409 927
## searchDFS:datasetmovies:task2.RetrieveValue 383 791
## searchDFS:datasetmovies:task3.Prediction 323 966
## searchDFS:datasetmovies:task4.Exploration 421 877
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 344 1039
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 296 874
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 382 842
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$confidence_udata)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of the ordinal thresholds. The model summary reports only
# Intercept[1] to Intercept[3] for this response, so a fourth threshold must
# not be requested: with exact matching it names a parameter that does not
# exist.
pairs(
  models$confidence_udata,
  pars = c(
    "b_Intercept[1]",
    "b_Intercept[2]",
    "b_Intercept[3]"
  ),
  fixed = TRUE
)
# Pairs plot of the main-effect coefficients. Names are matched exactly
# (`fixed = TRUE`) and are case-sensitive: the factor levels were relabelled
# to "Dziban" and "DFS", so the coefficients are `b_oracleDziban` and
# `b_searchDFS`, as shown in the model summary.
pairs(
  models$confidence_udata,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)
We now look at a response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Build the posterior-draws plot for each oracle/search combination.
# The helper (presumably from helper_functions.R) returns an object exposing
# `$plot` and `$intervals`.
confidence_udata_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models$confidence_udata,
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
confidence_udata_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Table of the interval bounds (.lower / .upper) at each .width for every search/oracle pair.
confidence_udata_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 1.00 0.672 1.31 0.95 mean qi
## 2 BFS Dziban 1.04 0.691 1.34 0.95 mean qi
## 3 DFS CompassQL 0.785 0.426 1.10 0.95 mean qi
## 4 DFS Dziban 1.17 0.859 1.47 0.95 mean qi
## 5 BFS CompassQL 1.00 0.906 1.11 0.5 mean qi
## 6 BFS Dziban 1.04 0.926 1.15 0.5 mean qi
## 7 DFS CompassQL 0.785 0.676 0.897 0.5 mean qi
## 8 DFS Dziban 1.17 1.06 1.28 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in the confidence-in-understanding-data rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws for every observed row. `re_formula = NA` is
# passed through so that group-level (participant) effects are left out of the
# predictions.
confidence_udata_predictive_data <- add_predicted_draws(
  user_response_data,
  models$confidence_udata,
  seed = seed,
  re_formula = NA
)
# Combined "<search> <oracle>" label, e.g. "DFS CompassQL".
confidence_udata_predictive_data$alg <- paste(
  confidence_udata_predictive_data$search,
  confidence_udata_predictive_data$oracle
)
Differences in user score by search algorithm.
# Compare predicted ratings between the two search algorithms.
search_differences$confidence_udata <- user_response_diff_plot(
  confidence_udata_predictive_data,
  "search",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$confidence_udata$plot
Differences in user score by oracle.
# Compare predicted ratings between the two oracles.
oracle_differences$confidence_udata <- user_response_diff_plot(
  confidence_udata_predictive_data,
  "oracle",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$confidence_udata$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Keep only the two search/oracle combinations being contrasted.
confidence_udata_predictive_data_subset <- subset(
  confidence_udata_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
# Compare predicted ratings between those two combinations.
alg_differences$confidence_udata <- user_response_diff_plot(
  confidence_udata_predictive_data_subset,
  "alg",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$confidence_udata$plot
Differences in user score by participant group
# Compare predicted ratings between participant groups.
participant_group_differences$confidence_udata <- user_response_diff_plot(
  confidence_udata_predictive_data,
  "participant_group",
  "confidence.udata",
  "Difference in Confidence in Understanding Data Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$confidence_udata$plot
# Cumulative-probit (ordinal) mixed model of the confidence-in-answer rating:
# full oracle x search x dataset x task interaction, a participant-group
# effect, and a random intercept per participant.
models$confidence_ans <- brm(
  formula = bf(
    confidence.ans ~ oracle * search * dataset * task +
      participant_group + (1 | participant_id)
  ),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  data = user_response_data,
  # Sampler settings: 2 chains x (2500 - 1000) post-warmup draws each.
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  # Path used by brm() to store the fitted model (see brm()'s `file` argument).
  file = "models/confidence_ans"
)
## Compiling Stan program...
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
# Print estimates, 95% intervals and the Rhat / Bulk_ESS / Tail_ESS diagnostics.
summary(models$confidence_ans)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: confidence.ans ~ oracle * search * dataset * task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.63 0.14 0.36 0.91 1.00 989 1536
##
## Population-Level Effects:
## Estimate Est.Error
## Intercept[1] -3.55 0.59
## Intercept[2] -2.70 0.54
## Intercept[3] -1.82 0.53
## Intercept[4] 0.08 0.51
## oracleDziban 0.02 0.70
## searchDFS -0.21 0.68
## datasetmovies -0.79 0.66
## task2.RetrieveValue 0.07 0.63
## task3.Prediction -1.62 0.59
## task4.Exploration -0.79 0.60
## participant_groupstudent 0.22 0.23
## oracleDziban:searchDFS -0.30 0.96
## oracleDziban:datasetmovies 0.98 0.96
## searchDFS:datasetmovies 1.08 0.95
## oracleDziban:task2.RetrieveValue -0.10 0.88
## oracleDziban:task3.Prediction 0.84 0.83
## oracleDziban:task4.Exploration 0.51 0.84
## searchDFS:task2.RetrieveValue 0.48 0.87
## searchDFS:task3.Prediction 0.43 0.79
## searchDFS:task4.Exploration -0.25 0.83
## datasetmovies:task2.RetrieveValue -0.06 0.86
## datasetmovies:task3.Prediction 0.81 0.78
## datasetmovies:task4.Exploration 0.22 0.83
## oracleDziban:searchDFS:datasetmovies -0.78 1.35
## oracleDziban:searchDFS:task2.RetrieveValue -0.96 1.19
## oracleDziban:searchDFS:task3.Prediction -0.31 1.14
## oracleDziban:searchDFS:task4.Exploration 0.25 1.17
## oracleDziban:datasetmovies:task2.RetrieveValue -1.19 1.21
## oracleDziban:datasetmovies:task3.Prediction -1.89 1.14
## oracleDziban:datasetmovies:task4.Exploration -1.22 1.16
## searchDFS:datasetmovies:task2.RetrieveValue -1.60 1.18
## searchDFS:datasetmovies:task3.Prediction -1.09 1.12
## searchDFS:datasetmovies:task4.Exploration -0.07 1.16
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 3.48 1.69
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 2.39 1.63
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 1.20 1.63
## l-95% CI u-95% CI Rhat
## Intercept[1] -4.75 -2.44 1.00
## Intercept[2] -3.81 -1.64 1.00
## Intercept[3] -2.89 -0.80 1.00
## Intercept[4] -0.96 1.06 1.00
## oracleDziban -1.35 1.40 1.00
## searchDFS -1.56 1.17 1.00
## datasetmovies -2.10 0.50 1.00
## task2.RetrieveValue -1.18 1.31 1.01
## task3.Prediction -2.81 -0.48 1.00
## task4.Exploration -1.96 0.37 1.00
## participant_groupstudent -0.22 0.67 1.00
## oracleDziban:searchDFS -2.21 1.64 1.00
## oracleDziban:datasetmovies -0.95 2.86 1.00
## searchDFS:datasetmovies -0.79 2.94 1.00
## oracleDziban:task2.RetrieveValue -1.83 1.62 1.01
## oracleDziban:task3.Prediction -0.69 2.53 1.00
## oracleDziban:task4.Exploration -1.15 2.17 1.00
## searchDFS:task2.RetrieveValue -1.23 2.24 1.01
## searchDFS:task3.Prediction -1.14 2.00 1.00
## searchDFS:task4.Exploration -1.91 1.42 1.00
## datasetmovies:task2.RetrieveValue -1.70 1.62 1.00
## datasetmovies:task3.Prediction -0.73 2.35 1.00
## datasetmovies:task4.Exploration -1.42 1.79 1.00
## oracleDziban:searchDFS:datasetmovies -3.48 1.86 1.00
## oracleDziban:searchDFS:task2.RetrieveValue -3.34 1.46 1.01
## oracleDziban:searchDFS:task3.Prediction -2.54 1.93 1.00
## oracleDziban:searchDFS:task4.Exploration -2.00 2.57 1.00
## oracleDziban:datasetmovies:task2.RetrieveValue -3.55 1.12 1.00
## oracleDziban:datasetmovies:task3.Prediction -4.13 0.30 1.00
## oracleDziban:datasetmovies:task4.Exploration -3.40 1.09 1.00
## searchDFS:datasetmovies:task2.RetrieveValue -3.93 0.66 1.00
## searchDFS:datasetmovies:task3.Prediction -3.24 1.20 1.00
## searchDFS:datasetmovies:task4.Exploration -2.37 2.24 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 0.24 6.90 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -0.77 5.62 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration -1.93 4.32 1.00
## Bulk_ESS Tail_ESS
## Intercept[1] 700 1474
## Intercept[2] 616 1073
## Intercept[3] 591 1167
## Intercept[4] 602 1254
## oracleDziban 558 1001
## searchDFS 543 881
## datasetmovies 550 1151
## task2.RetrieveValue 652 1104
## task3.Prediction 666 954
## task4.Exploration 691 1092
## participant_groupstudent 1875 2000
## oracleDziban:searchDFS 549 1062
## oracleDziban:datasetmovies 555 1048
## searchDFS:datasetmovies 540 812
## oracleDziban:task2.RetrieveValue 615 1268
## oracleDziban:task3.Prediction 695 1125
## oracleDziban:task4.Exploration 702 1111
## searchDFS:task2.RetrieveValue 628 850
## searchDFS:task3.Prediction 703 1004
## searchDFS:task4.Exploration 674 1063
## datasetmovies:task2.RetrieveValue 670 1037
## datasetmovies:task3.Prediction 642 860
## datasetmovies:task4.Exploration 685 1083
## oracleDziban:searchDFS:datasetmovies 560 1190
## oracleDziban:searchDFS:task2.RetrieveValue 631 985
## oracleDziban:searchDFS:task3.Prediction 790 1183
## oracleDziban:searchDFS:task4.Exploration 740 1034
## oracleDziban:datasetmovies:task2.RetrieveValue 664 1340
## oracleDziban:datasetmovies:task3.Prediction 648 1283
## oracleDziban:datasetmovies:task4.Exploration 715 1046
## searchDFS:datasetmovies:task2.RetrieveValue 663 1033
## searchDFS:datasetmovies:task3.Prediction 737 1137
## searchDFS:datasetmovies:task4.Exploration 710 1218
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 702 1325
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 793 1129
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 729 1256
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$confidence_ans)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plot of the four ordinal thresholds, b_Intercept[1] .. b_Intercept[4].
pairs(
  models$confidence_ans,
  pars = paste0("b_Intercept[", 1:4, "]"),
  fixed = TRUE
)
# Pairs plot of the main-effect coefficients. Names are matched exactly
# (`fixed = TRUE`) and are case-sensitive: the factor levels were relabelled
# to "Dziban" and "DFS", so the coefficients are `b_oracleDziban` and
# `b_searchDFS`, as shown in the model summary.
pairs(
  models$confidence_ans,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)
We now look at a response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Build the posterior-draws plot for each oracle/search combination.
# The helper (presumably from helper_functions.R) returns an object exposing
# `$plot` and `$intervals`.
confidence_ans_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models$confidence_ans,
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
confidence_ans_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
# Table of the interval bounds (.lower / .upper) at each .width for every search/oracle pair.
confidence_ans_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.992 0.656 1.27 0.95 mean qi
## 2 BFS Dziban 1.14 0.838 1.40 0.95 mean qi
## 3 DFS CompassQL 1.07 0.765 1.34 0.95 mean qi
## 4 DFS Dziban 1.22 0.937 1.48 0.95 mean qi
## 5 BFS CompassQL 0.992 0.891 1.09 0.5 mean qi
## 6 BFS Dziban 1.14 1.04 1.24 0.5 mean qi
## 7 DFS CompassQL 1.07 0.971 1.18 0.5 mean qi
## 8 DFS Dziban 1.22 1.12 1.31 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in the confidence-in-answer rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws for every observed row. `re_formula = NA` is
# passed through so that group-level (participant) effects are left out of the
# predictions.
confidence_ans_predictive_data <- add_predicted_draws(
  user_response_data,
  models$confidence_ans,
  seed = seed,
  re_formula = NA
)
# Combined "<search> <oracle>" label, e.g. "DFS CompassQL".
confidence_ans_predictive_data$alg <- paste(
  confidence_ans_predictive_data$search,
  confidence_ans_predictive_data$oracle
)
Differences in user score by search algorithm.
# Compare predicted ratings between the two search algorithms.
search_differences$confidence_ans <- user_response_diff_plot(
  confidence_ans_predictive_data,
  "search",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$confidence_ans$plot
Differences in user score by oracle.
# Compare predicted ratings between the two oracles.
oracle_differences$confidence_ans <- user_response_diff_plot(
  confidence_ans_predictive_data,
  "oracle",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$confidence_ans$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Keep only the two search/oracle combinations being contrasted.
confidence_ans_predictive_data_subset <- subset(
  confidence_ans_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
# Compare predicted ratings between those two combinations.
alg_differences$confidence_ans <- user_response_diff_plot(
  confidence_ans_predictive_data_subset,
  "alg",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$confidence_ans$plot
Differences in user score by participant group
# Compare predicted ratings between participant groups.
participant_group_differences$confidence_ans <- user_response_diff_plot(
  confidence_ans_predictive_data,
  "participant_group",
  "confidence.ans",
  "Difference in Confidence in Answer Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$confidence_ans$plot
# Base name for this metric; not used within the visible part of the script.
filename <- "efficiency"

# Cumulative-probit (ordinal) mixed model of the efficiency rating: full
# oracle x search x dataset x task interaction, a participant-group effect,
# and a random intercept per participant.
# NOTE(review): the recorded run warned that 4 post-warmup transitions hit the
# maximum treedepth of 10; consider adding `max_treedepth` to `control` when
# this model is next refitted.
models$efficiency <- brm(
  formula = bf(
    efficiency ~ oracle * search * dataset * task +
      participant_group + (1 | participant_id)
  ),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  data = user_response_data,
  # Sampler settings: 2 chains x (2500 - 1000) post-warmup draws each.
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  # Path used by brm() to store the fitted model (see brm()'s `file` argument).
  file = "models/efficiency"
)
## Compiling Stan program...
## Start sampling
## Warning: There were 4 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
# Print estimates, 95% intervals and the Rhat / Bulk_ESS / Tail_ESS diagnostics.
summary(models$efficiency)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: efficiency ~ oracle * search * dataset * task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.23 0.18 0.91 1.62 1.00 762 1467
##
## Population-Level Effects:
## Estimate Est.Error
## Intercept[1] -3.28 0.67
## Intercept[2] -1.57 0.63
## Intercept[3] -0.65 0.63
## Intercept[4] 0.63 0.63
## oracleDziban -0.44 0.80
## searchDFS -2.49 0.83
## datasetmovies -0.33 0.84
## task2.RetrieveValue -0.81 0.55
## task3.Prediction -0.25 0.57
## task4.Exploration 0.03 0.56
## participant_groupstudent 0.31 0.35
## oracleDziban:searchDFS 1.75 1.16
## oracleDziban:datasetmovies -0.78 1.13
## searchDFS:datasetmovies 1.42 1.15
## oracleDziban:task2.RetrieveValue 0.67 0.77
## oracleDziban:task3.Prediction 0.73 0.78
## oracleDziban:task4.Exploration 0.51 0.77
## searchDFS:task2.RetrieveValue 0.50 0.81
## searchDFS:task3.Prediction 0.37 0.83
## searchDFS:task4.Exploration 0.75 0.81
## datasetmovies:task2.RetrieveValue 0.10 0.78
## datasetmovies:task3.Prediction -0.15 0.79
## datasetmovies:task4.Exploration -0.19 0.80
## oracleDziban:searchDFS:datasetmovies 0.02 1.60
## oracleDziban:searchDFS:task2.RetrieveValue -1.20 1.12
## oracleDziban:searchDFS:task3.Prediction -0.93 1.11
## oracleDziban:searchDFS:task4.Exploration -1.52 1.10
## oracleDziban:datasetmovies:task2.RetrieveValue 0.23 1.09
## oracleDziban:datasetmovies:task3.Prediction 1.09 1.09
## oracleDziban:datasetmovies:task4.Exploration 0.79 1.10
## searchDFS:datasetmovies:task2.RetrieveValue 0.36 1.10
## searchDFS:datasetmovies:task3.Prediction 0.55 1.13
## searchDFS:datasetmovies:task4.Exploration -0.10 1.12
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 0.13 1.53
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -1.13 1.56
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 1.01 1.55
## l-95% CI u-95% CI Rhat
## Intercept[1] -4.63 -2.01 1.00
## Intercept[2] -2.81 -0.35 1.00
## Intercept[3] -1.86 0.55 1.00
## Intercept[4] -0.62 1.87 1.00
## oracleDziban -2.10 1.17 1.01
## searchDFS -4.16 -0.94 1.00
## datasetmovies -1.96 1.33 1.00
## task2.RetrieveValue -1.93 0.23 1.01
## task3.Prediction -1.39 0.87 1.00
## task4.Exploration -1.10 1.08 1.00
## participant_groupstudent -0.36 1.00 1.00
## oracleDziban:searchDFS -0.48 4.02 1.01
## oracleDziban:datasetmovies -2.99 1.42 1.00
## searchDFS:datasetmovies -0.81 3.62 1.00
## oracleDziban:task2.RetrieveValue -0.84 2.18 1.01
## oracleDziban:task3.Prediction -0.76 2.29 1.01
## oracleDziban:task4.Exploration -0.98 1.99 1.01
## searchDFS:task2.RetrieveValue -1.02 2.14 1.01
## searchDFS:task3.Prediction -1.24 2.05 1.00
## searchDFS:task4.Exploration -0.82 2.35 1.00
## datasetmovies:task2.RetrieveValue -1.40 1.67 1.00
## datasetmovies:task3.Prediction -1.68 1.41 1.00
## datasetmovies:task4.Exploration -1.78 1.39 1.00
## oracleDziban:searchDFS:datasetmovies -3.08 3.16 1.00
## oracleDziban:searchDFS:task2.RetrieveValue -3.45 0.91 1.01
## oracleDziban:searchDFS:task3.Prediction -3.06 1.17 1.01
## oracleDziban:searchDFS:task4.Exploration -3.70 0.68 1.01
## oracleDziban:datasetmovies:task2.RetrieveValue -1.85 2.37 1.01
## oracleDziban:datasetmovies:task3.Prediction -1.02 3.17 1.00
## oracleDziban:datasetmovies:task4.Exploration -1.35 2.93 1.01
## searchDFS:datasetmovies:task2.RetrieveValue -1.82 2.49 1.00
## searchDFS:datasetmovies:task3.Prediction -1.68 2.71 1.00
## searchDFS:datasetmovies:task4.Exploration -2.27 2.01 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue -2.82 3.12 1.01
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -4.22 1.79 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration -2.08 4.06 1.01
## Bulk_ESS Tail_ESS
## Intercept[1] 681 1130
## Intercept[2] 645 973
## Intercept[3] 640 992
## Intercept[4] 627 953
## oracleDziban 565 959
## searchDFS 553 774
## datasetmovies 508 824
## task2.RetrieveValue 720 1364
## task3.Prediction 682 1011
## task4.Exploration 760 1407
## participant_groupstudent 827 1334
## oracleDziban:searchDFS 480 1047
## oracleDziban:datasetmovies 498 706
## searchDFS:datasetmovies 450 904
## oracleDziban:task2.RetrieveValue 698 1200
## oracleDziban:task3.Prediction 724 1197
## oracleDziban:task4.Exploration 771 1432
## searchDFS:task2.RetrieveValue 634 991
## searchDFS:task3.Prediction 654 1225
## searchDFS:task4.Exploration 687 961
## datasetmovies:task2.RetrieveValue 622 1295
## datasetmovies:task3.Prediction 657 977
## datasetmovies:task4.Exploration 717 1237
## oracleDziban:searchDFS:datasetmovies 427 993
## oracleDziban:searchDFS:task2.RetrieveValue 649 1065
## oracleDziban:searchDFS:task3.Prediction 670 1185
## oracleDziban:searchDFS:task4.Exploration 697 1181
## oracleDziban:datasetmovies:task2.RetrieveValue 667 1309
## oracleDziban:datasetmovies:task3.Prediction 678 1200
## oracleDziban:datasetmovies:task4.Exploration 718 1500
## searchDFS:datasetmovies:task2.RetrieveValue 586 987
## searchDFS:datasetmovies:task3.Prediction 647 1092
## searchDFS:datasetmovies:task4.Exploration 655 1137
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 615 1296
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 657 1421
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 687 1199
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$efficiency)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots for the efficiency model, used to spot strongly correlated
# parameters. `fixed = TRUE` makes brms match `pars` exactly (case-sensitive)
# instead of treating them as regular expressions.

# Thresholds (cut points) of the cumulative model.
pairs(
  models$efficiency,
  pars = c(
    "b_Intercept[1]",
    "b_Intercept[2]",
    "b_Intercept[3]",
    "b_Intercept[4]"
  ),
  fixed = TRUE
)

# Main effects. The names have to match the coefficients reported by
# summary(): the factor levels are "Dziban" and "DFS", so the lowercase
# "b_oracledziban" / "b_searchdfs" would not match any parameter.
pairs(
  models$efficiency,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)
We now look at a response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Summarise posterior draws of the efficiency rating per search/oracle
# combination; the returned object carries both the plot and its interval table.
efficiency_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models$efficiency,
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
efficiency_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
efficiency_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.705 0.125 1.23 0.95 mean qi
## 2 BFS Dziban 0.608 0.0732 1.09 0.95 mean qi
## 3 DFS CompassQL -0.249 -0.779 0.294 0.95 mean qi
## 4 DFS Dziban 0.293 -0.234 0.828 0.95 mean qi
## 5 BFS CompassQL 0.705 0.516 0.906 0.5 mean qi
## 6 BFS Dziban 0.608 0.426 0.809 0.5 mean qi
## 7 DFS CompassQL -0.249 -0.426 -0.0588 0.5 mean qi
## 8 DFS Dziban 0.293 0.109 0.484 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in efficiency ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws of the efficiency rating for every observed row.
# re_formula = NA leaves the participant-level effects out of the predictions.
efficiency_predictive_data <- add_predicted_draws(
  user_response_data,
  models$efficiency,
  seed = seed,
  re_formula = NA
)
# Label each row with its search/oracle pairing, e.g. "DFS CompassQL".
efficiency_predictive_data$alg <- with(
  efficiency_predictive_data,
  paste(search, oracle)
)
Differences in user score by search algorithm.
# Difference in predicted efficiency rating between the search algorithms.
search_differences$efficiency <- user_response_diff_plot(
  efficiency_predictive_data,
  "search",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$efficiency$plot
Differences in user score by oracle.
# Difference in predicted efficiency rating between the oracles.
oracle_differences$efficiency <- user_response_diff_plot(
  efficiency_predictive_data,
  "oracle",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$efficiency$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Keep only the two pairings being contrasted, then compare them.
efficiency_predictive_data_data_subset <- subset(
  efficiency_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
alg_differences$efficiency <- user_response_diff_plot(
  efficiency_predictive_data_data_subset,
  "alg",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$efficiency$plot
Differences in user score by participant group
# Difference in predicted efficiency rating between participant groups.
participant_group_differences$efficiency <- user_response_diff_plot(
  efficiency_predictive_data,
  "participant_group",
  "efficiency",
  "Difference in Efficiency Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$efficiency$plot
# Cumulative-probit mixed model of the ease-of-use rating: full
# oracle x search x dataset x task interaction, a participant-group effect and
# a random intercept per participant.
models$ease_of_use <- brm(
  # Model specification
  formula = bf(
    ease.of.use ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  data = user_response_data,
  # Sampler settings
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  # Path brms uses to store / reload the fitted model
  file = "models/ease_of_use"
)
## Compiling Stan program...
## Start sampling
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$ease_of_use)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: ease.of.use ~ oracle * search * dataset * task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.13 0.16 0.83 1.47 1.01 831 1624
##
## Population-Level Effects:
## Estimate Est.Error
## Intercept[1] -2.75 0.66
## Intercept[2] -1.23 0.61
## Intercept[3] -0.22 0.61
## Intercept[4] 1.82 0.63
## oracleDziban -0.28 0.79
## searchDFS -1.66 0.80
## datasetmovies -0.51 0.80
## task2.RetrieveValue 0.76 0.59
## task3.Prediction 0.58 0.60
## task4.Exploration 1.35 0.63
## participant_groupstudent 0.55 0.33
## oracleDziban:searchDFS 1.01 1.09
## oracleDziban:datasetmovies 1.25 1.11
## searchDFS:datasetmovies 1.67 1.09
## oracleDziban:task2.RetrieveValue -0.92 0.79
## oracleDziban:task3.Prediction 0.19 0.81
## oracleDziban:task4.Exploration -0.87 0.83
## searchDFS:task2.RetrieveValue -0.03 0.81
## searchDFS:task3.Prediction -0.75 0.81
## searchDFS:task4.Exploration -1.32 0.82
## datasetmovies:task2.RetrieveValue 0.24 0.82
## datasetmovies:task3.Prediction 0.69 0.84
## datasetmovies:task4.Exploration -0.47 0.84
## oracleDziban:searchDFS:datasetmovies -1.90 1.56
## oracleDziban:searchDFS:task2.RetrieveValue -0.13 1.11
## oracleDziban:searchDFS:task3.Prediction -0.08 1.14
## oracleDziban:searchDFS:task4.Exploration 0.78 1.14
## oracleDziban:datasetmovies:task2.RetrieveValue -0.46 1.10
## oracleDziban:datasetmovies:task3.Prediction -2.03 1.15
## oracleDziban:datasetmovies:task4.Exploration -0.52 1.14
## searchDFS:datasetmovies:task2.RetrieveValue -1.17 1.12
## searchDFS:datasetmovies:task3.Prediction -0.00 1.12
## searchDFS:datasetmovies:task4.Exploration 0.71 1.10
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 2.10 1.57
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 2.12 1.59
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 1.87 1.60
## l-95% CI u-95% CI Rhat
## Intercept[1] -4.07 -1.48 1.00
## Intercept[2] -2.46 -0.04 1.00
## Intercept[3] -1.45 0.95 1.00
## Intercept[4] 0.61 3.04 1.00
## oracleDziban -1.85 1.22 1.00
## searchDFS -3.17 -0.03 1.00
## datasetmovies -2.05 1.07 1.00
## task2.RetrieveValue -0.42 1.91 1.00
## task3.Prediction -0.56 1.78 1.00
## task4.Exploration 0.14 2.63 1.01
## participant_groupstudent -0.07 1.21 1.00
## oracleDziban:searchDFS -1.13 3.10 1.00
## oracleDziban:datasetmovies -0.90 3.47 1.00
## searchDFS:datasetmovies -0.51 3.89 1.00
## oracleDziban:task2.RetrieveValue -2.46 0.61 1.00
## oracleDziban:task3.Prediction -1.38 1.81 1.00
## oracleDziban:task4.Exploration -2.50 0.72 1.00
## searchDFS:task2.RetrieveValue -1.60 1.53 1.00
## searchDFS:task3.Prediction -2.38 0.85 1.00
## searchDFS:task4.Exploration -2.93 0.26 1.00
## datasetmovies:task2.RetrieveValue -1.38 1.84 1.00
## datasetmovies:task3.Prediction -1.03 2.28 1.00
## datasetmovies:task4.Exploration -2.15 1.14 1.00
## oracleDziban:searchDFS:datasetmovies -5.00 1.12 1.00
## oracleDziban:searchDFS:task2.RetrieveValue -2.30 2.01 1.00
## oracleDziban:searchDFS:task3.Prediction -2.37 2.15 1.00
## oracleDziban:searchDFS:task4.Exploration -1.46 2.98 1.00
## oracleDziban:datasetmovies:task2.RetrieveValue -2.58 1.80 1.00
## oracleDziban:datasetmovies:task3.Prediction -4.27 0.27 1.00
## oracleDziban:datasetmovies:task4.Exploration -2.67 1.69 1.00
## searchDFS:datasetmovies:task2.RetrieveValue -3.40 1.02 1.00
## searchDFS:datasetmovies:task3.Prediction -2.21 2.19 1.01
## searchDFS:datasetmovies:task4.Exploration -1.43 2.84 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue -0.88 5.19 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -0.98 5.34 1.01
## oracleDziban:searchDFS:datasetmovies:task4.Exploration -1.25 4.96 1.00
## Bulk_ESS Tail_ESS
## Intercept[1] 615 963
## Intercept[2] 602 1115
## Intercept[3] 607 1032
## Intercept[4] 624 1183
## oracleDziban 538 1141
## searchDFS 538 866
## datasetmovies 544 880
## task2.RetrieveValue 537 1389
## task3.Prediction 550 1106
## task4.Exploration 673 1423
## participant_groupstudent 1078 1408
## oracleDziban:searchDFS 590 1045
## oracleDziban:datasetmovies 590 1137
## searchDFS:datasetmovies 532 918
## oracleDziban:task2.RetrieveValue 586 1436
## oracleDziban:task3.Prediction 516 1555
## oracleDziban:task4.Exploration 691 1497
## searchDFS:task2.RetrieveValue 641 1263
## searchDFS:task3.Prediction 609 887
## searchDFS:task4.Exploration 707 1759
## datasetmovies:task2.RetrieveValue 545 1273
## datasetmovies:task3.Prediction 528 1240
## datasetmovies:task4.Exploration 623 1381
## oracleDziban:searchDFS:datasetmovies 564 1273
## oracleDziban:searchDFS:task2.RetrieveValue 725 1351
## oracleDziban:searchDFS:task3.Prediction 643 1391
## oracleDziban:searchDFS:task4.Exploration 830 1694
## oracleDziban:datasetmovies:task2.RetrieveValue 589 1407
## oracleDziban:datasetmovies:task3.Prediction 504 1014
## oracleDziban:datasetmovies:task4.Exploration 717 1627
## searchDFS:datasetmovies:task2.RetrieveValue 652 1469
## searchDFS:datasetmovies:task3.Prediction 545 981
## searchDFS:datasetmovies:task4.Exploration 619 1364
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 805 1534
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 591 1623
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 850 1774
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$ease_of_use)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots for the ease-of-use model, used to spot strongly correlated
# parameters. `fixed = TRUE` makes brms match `pars` exactly (case-sensitive)
# instead of treating them as regular expressions.

# Thresholds (cut points) of the cumulative model.
pairs(
  models$ease_of_use,
  pars = c(
    "b_Intercept[1]",
    "b_Intercept[2]",
    "b_Intercept[3]",
    "b_Intercept[4]"
  ),
  fixed = TRUE
)

# Main effects. The names have to match the coefficients reported by
# summary(): the factor levels are "Dziban" and "DFS", so the lowercase
# "b_oracledziban" / "b_searchdfs" would not match any parameter.
pairs(
  models$ease_of_use,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)
We now look at a response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Summarise posterior draws of the ease-of-use rating per search/oracle
# combination; the returned object carries both the plot and its interval table.
ease_of_use_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models$ease_of_use,
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
ease_of_use_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
ease_of_use_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.943 0.547 1.30 0.95 mean qi
## 2 BFS Dziban 0.704 0.265 1.09 0.95 mean qi
## 3 DFS CompassQL 0.0674 -0.412 0.5 0.95 mean qi
## 4 DFS Dziban 0.407 -0.0312 0.812 0.95 mean qi
## 5 BFS CompassQL 0.943 0.812 1.08 0.5 mean qi
## 6 BFS Dziban 0.704 0.574 0.838 0.5 mean qi
## 7 DFS CompassQL 0.0674 -0.0882 0.235 0.5 mean qi
## 8 DFS Dziban 0.407 0.25 0.562 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in ease-of-use ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws of the ease-of-use rating for every observed row.
# re_formula = NA leaves the participant-level effects out of the predictions.
ease_of_use_predictive_data <- add_predicted_draws(
  user_response_data,
  models$ease_of_use,
  seed = seed,
  re_formula = NA
)
# Label each row with its search/oracle pairing, e.g. "DFS CompassQL".
ease_of_use_predictive_data$alg <- with(
  ease_of_use_predictive_data,
  paste(search, oracle)
)
Differences in user score by search algorithm.
# Difference in predicted ease-of-use rating between the search algorithms.
search_differences$ease_of_use <- user_response_diff_plot(
  ease_of_use_predictive_data,
  "search",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$ease_of_use$plot
Differences in user score by oracle.
# Difference in predicted ease-of-use rating between the oracles.
oracle_differences$ease_of_use <- user_response_diff_plot(
  ease_of_use_predictive_data,
  "oracle",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$ease_of_use$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Keep only the two pairings being contrasted, then compare them.
ease_of_use_predictive_data_subset <- subset(
  ease_of_use_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
alg_differences$ease_of_use <- user_response_diff_plot(
  ease_of_use_predictive_data_subset,
  "alg",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$ease_of_use$plot
Differences in user score by participant group
# Difference in predicted ease-of-use rating between participant groups.
participant_group_differences$ease_of_use <- user_response_diff_plot(
  ease_of_use_predictive_data,
  "participant_group",
  "ease.of.use",
  "Difference in Ease of Use Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$ease_of_use$plot
# Cumulative-probit mixed model of the utility rating: full
# oracle x search x dataset x task interaction, a participant-group effect and
# a random intercept per participant.
# NOTE(review): this fit reported a transition exceeding the maximum treedepth;
# consider adding max_treedepth to `control` if the model is ever refit.
models$utility <- brm(
  # Model specification
  formula = bf(
    utility ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  data = user_response_data,
  # Sampler settings
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  # Path brms uses to store / reload the fitted model
  file = "models/utility"
)
## Compiling Stan program...
## Start sampling
## Warning: There were 1 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$utility)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: utility ~ oracle * search * dataset * task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.06 0.16 0.77 1.41 1.00 787 1539
##
## Population-Level Effects:
## Estimate Est.Error
## Intercept[1] -2.26 0.63
## Intercept[2] -0.89 0.60
## Intercept[3] -0.20 0.60
## Intercept[4] 1.18 0.60
## oracleDziban 0.31 0.77
## searchDFS -1.85 0.81
## datasetmovies -0.13 0.79
## task2.RetrieveValue -0.44 0.54
## task3.Prediction -0.06 0.55
## task4.Exploration 0.41 0.56
## participant_groupstudent 0.31 0.30
## oracleDziban:searchDFS 0.65 1.12
## oracleDziban:datasetmovies -1.00 1.07
## searchDFS:datasetmovies 1.23 1.09
## oracleDziban:task2.RetrieveValue -0.16 0.76
## oracleDziban:task3.Prediction 0.06 0.77
## oracleDziban:task4.Exploration -0.48 0.78
## searchDFS:task2.RetrieveValue 0.33 0.81
## searchDFS:task3.Prediction 0.25 0.79
## searchDFS:task4.Exploration 0.68 0.81
## datasetmovies:task2.RetrieveValue -0.68 0.78
## datasetmovies:task3.Prediction 0.36 0.78
## datasetmovies:task4.Exploration 0.19 0.79
## oracleDziban:searchDFS:datasetmovies 0.24 1.54
## oracleDziban:searchDFS:task2.RetrieveValue -0.01 1.12
## oracleDziban:searchDFS:task3.Prediction -0.25 1.12
## oracleDziban:searchDFS:task4.Exploration -0.34 1.14
## oracleDziban:datasetmovies:task2.RetrieveValue 1.44 1.09
## oracleDziban:datasetmovies:task3.Prediction 0.61 1.10
## oracleDziban:datasetmovies:task4.Exploration 1.37 1.10
## searchDFS:datasetmovies:task2.RetrieveValue 0.98 1.09
## searchDFS:datasetmovies:task3.Prediction 0.10 1.07
## searchDFS:datasetmovies:task4.Exploration -0.82 1.10
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue -0.71 1.57
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -0.29 1.55
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 0.23 1.57
## l-95% CI u-95% CI Rhat
## Intercept[1] -3.48 -1.09 1.00
## Intercept[2] -2.08 0.23 1.00
## Intercept[3] -1.38 0.92 1.00
## Intercept[4] -0.01 2.31 1.00
## oracleDziban -1.28 1.75 1.01
## searchDFS -3.51 -0.37 1.00
## datasetmovies -1.68 1.42 1.00
## task2.RetrieveValue -1.46 0.60 1.01
## task3.Prediction -1.13 1.02 1.01
## task4.Exploration -0.68 1.49 1.01
## participant_groupstudent -0.29 0.91 1.00
## oracleDziban:searchDFS -1.51 2.89 1.00
## oracleDziban:datasetmovies -3.06 1.17 1.01
## searchDFS:datasetmovies -0.89 3.36 1.00
## oracleDziban:task2.RetrieveValue -1.67 1.30 1.00
## oracleDziban:task3.Prediction -1.43 1.54 1.00
## oracleDziban:task4.Exploration -1.96 1.04 1.01
## searchDFS:task2.RetrieveValue -1.25 1.89 1.01
## searchDFS:task3.Prediction -1.31 1.77 1.01
## searchDFS:task4.Exploration -0.90 2.28 1.01
## datasetmovies:task2.RetrieveValue -2.23 0.85 1.00
## datasetmovies:task3.Prediction -1.18 1.85 1.01
## datasetmovies:task4.Exploration -1.35 1.74 1.00
## oracleDziban:searchDFS:datasetmovies -2.76 3.25 1.00
## oracleDziban:searchDFS:task2.RetrieveValue -2.20 2.17 1.01
## oracleDziban:searchDFS:task3.Prediction -2.35 1.95 1.01
## oracleDziban:searchDFS:task4.Exploration -2.53 1.87 1.01
## oracleDziban:datasetmovies:task2.RetrieveValue -0.66 3.62 1.00
## oracleDziban:datasetmovies:task3.Prediction -1.51 2.80 1.00
## oracleDziban:datasetmovies:task4.Exploration -0.74 3.51 1.00
## searchDFS:datasetmovies:task2.RetrieveValue -1.07 3.17 1.00
## searchDFS:datasetmovies:task3.Prediction -1.94 2.18 1.01
## searchDFS:datasetmovies:task4.Exploration -3.00 1.42 1.01
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue -3.76 2.22 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -3.39 2.65 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration -2.86 3.35 1.01
## Bulk_ESS Tail_ESS
## Intercept[1] 452 929
## Intercept[2] 459 869
## Intercept[3] 474 969
## Intercept[4] 481 1043
## oracleDziban 436 854
## searchDFS 424 878
## datasetmovies 452 862
## task2.RetrieveValue 481 1197
## task3.Prediction 443 1201
## task4.Exploration 425 1087
## participant_groupstudent 751 1167
## oracleDziban:searchDFS 396 724
## oracleDziban:datasetmovies 424 832
## searchDFS:datasetmovies 431 824
## oracleDziban:task2.RetrieveValue 471 1078
## oracleDziban:task3.Prediction 447 1202
## oracleDziban:task4.Exploration 408 1114
## searchDFS:task2.RetrieveValue 445 992
## searchDFS:task3.Prediction 379 758
## searchDFS:task4.Exploration 386 1010
## datasetmovies:task2.RetrieveValue 565 1065
## datasetmovies:task3.Prediction 508 1428
## datasetmovies:task4.Exploration 476 1054
## oracleDziban:searchDFS:datasetmovies 400 726
## oracleDziban:searchDFS:task2.RetrieveValue 422 807
## oracleDziban:searchDFS:task3.Prediction 386 1009
## oracleDziban:searchDFS:task4.Exploration 337 853
## oracleDziban:datasetmovies:task2.RetrieveValue 554 1210
## oracleDziban:datasetmovies:task3.Prediction 511 1247
## oracleDziban:datasetmovies:task4.Exploration 461 1241
## searchDFS:datasetmovies:task2.RetrieveValue 483 787
## searchDFS:datasetmovies:task3.Prediction 429 1001
## searchDFS:datasetmovies:task4.Exploration 429 838
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 480 781
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 429 868
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 397 997
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$utility)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots for the utility model, used to spot strongly correlated
# parameters. `fixed = TRUE` makes brms match `pars` exactly (case-sensitive)
# instead of treating them as regular expressions.

# Thresholds (cut points) of the cumulative model.
pairs(
  models$utility,
  pars = c(
    "b_Intercept[1]",
    "b_Intercept[2]",
    "b_Intercept[3]",
    "b_Intercept[4]"
  ),
  fixed = TRUE
)

# Main effects. The names have to match the coefficients reported by
# summary(): the factor levels are "Dziban" and "DFS", so the lowercase
# "b_oracledziban" / "b_searchdfs" would not match any parameter.
pairs(
  models$utility,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)
We now look at a response for utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Summarise posterior draws of the utility rating per search/oracle
# combination; the returned object carries both the plot and its interval table.
utility_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models$utility,
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
utility_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
utility_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.535 -0.0156 1.05 0.95 mean qi
## 2 BFS Dziban 0.596 0.0441 1.09 0.95 mean qi
## 3 DFS CompassQL -0.187 -0.706 0.353 0.95 mean qi
## 4 DFS Dziban 0.301 -0.25 0.812 0.95 mean qi
## 5 BFS CompassQL 0.535 0.359 0.719 0.5 mean qi
## 6 BFS Dziban 0.596 0.426 0.765 0.5 mean qi
## 7 DFS CompassQL -0.187 -0.382 0 0.5 mean qi
## 8 DFS Dziban 0.301 0.125 0.484 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any significant difference in utility ratings between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Posterior predictive draws of the utility rating for every observed row.
# re_formula = NA leaves the participant-level effects out of the predictions.
utility_predictive_data <- add_predicted_draws(
  user_response_data,
  models$utility,
  seed = seed,
  re_formula = NA
)
# Label each row with its search/oracle pairing, e.g. "DFS CompassQL".
utility_predictive_data$alg <- with(
  utility_predictive_data,
  paste(search, oracle)
)
Differences in user score by search algorithm.
# Difference in predicted utility rating between the search algorithms.
search_differences$utility <- user_response_diff_plot(
  utility_predictive_data,
  "search",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$utility$plot
Differences in user score by oracle.
# Difference in predicted utility rating between the oracles.
oracle_differences$utility <- user_response_diff_plot(
  utility_predictive_data,
  "oracle",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$utility$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Keep only the two pairings being contrasted, then compare them.
utility_predictive_data_subset <- subset(
  utility_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
alg_differences$utility <- user_response_diff_plot(
  utility_predictive_data_subset,
  "alg",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$utility$plot
Differences in user score by participant group
# Difference in predicted utility rating between participant groups.
participant_group_differences$utility <- user_response_diff_plot(
  utility_predictive_data,
  "participant_group",
  "utility",
  "Difference in Utility Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$utility$plot
# Cumulative-probit mixed model of the overall rating: full
# oracle x search x dataset x task interaction, a participant-group effect and
# a random intercept per participant.
# NOTE(review): this fit reported transitions exceeding the maximum treedepth;
# consider adding max_treedepth to `control` if the model is ever refit.
models$overall <- brm(
  # Model specification
  formula = bf(
    overall ~ oracle * search * dataset * task + participant_group +
      (1 | participant_id)
  ),
  family = cumulative("probit"),
  prior = prior(normal(0.26, 1.26), class = Intercept),
  data = user_response_data,
  # Sampler settings
  chains = 2,
  cores = 2,
  iter = 2500,
  warmup = 1000,
  control = list(adapt_delta = 0.99),
  seed = seed,
  # Path brms uses to store / reload the fitted model
  file = "models/overall"
)
## Compiling Stan program...
## Start sampling
## Warning: There were 18 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
## http://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded
## Warning: Examine the pairs() plot to diagnose sampling problems
Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.
summary(models$overall)
## Family: cumulative
## Links: mu = probit; disc = identity
## Formula: overall ~ oracle * search * dataset * task + participant_group + (1 | participant_id)
## Data: user_response_data (Number of observations: 264)
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
## total post-warmup samples = 3000
##
## Group-Level Effects:
## ~participant_id (Number of levels: 66)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 1.69 0.22 1.29 2.16 1.00 846 1336
##
## Population-Level Effects:
## Estimate Est.Error
## Intercept[1] -3.20 0.85
## Intercept[2] -1.66 0.82
## Intercept[3] -0.24 0.81
## Intercept[4] 2.24 0.83
## oracleDziban -0.81 1.04
## searchDFS -2.02 1.08
## datasetmovies -0.83 1.05
## task2.RetrieveValue 0.01 0.63
## task3.Prediction 0.41 0.62
## task4.Exploration 1.18 0.66
## participant_groupstudent 0.78 0.47
## oracleDziban:searchDFS 2.04 1.52
## oracleDziban:datasetmovies 1.35 1.45
## searchDFS:datasetmovies 2.24 1.54
## oracleDziban:task2.RetrieveValue 0.53 0.89
## oracleDziban:task3.Prediction 1.69 0.90
## oracleDziban:task4.Exploration 1.20 0.91
## searchDFS:task2.RetrieveValue 0.90 0.87
## searchDFS:task3.Prediction 0.48 0.85
## searchDFS:task4.Exploration -0.75 0.89
## datasetmovies:task2.RetrieveValue 0.28 0.85
## datasetmovies:task3.Prediction -0.30 0.84
## datasetmovies:task4.Exploration -0.59 0.86
## oracleDziban:searchDFS:datasetmovies -2.27 2.12
## oracleDziban:searchDFS:task2.RetrieveValue -2.65 1.21
## oracleDziban:searchDFS:task3.Prediction -1.82 1.21
## oracleDziban:searchDFS:task4.Exploration -1.13 1.23
## oracleDziban:datasetmovies:task2.RetrieveValue -1.31 1.22
## oracleDziban:datasetmovies:task3.Prediction -1.48 1.20
## oracleDziban:datasetmovies:task4.Exploration -1.94 1.23
## searchDFS:datasetmovies:task2.RetrieveValue -1.50 1.18
## searchDFS:datasetmovies:task3.Prediction -0.59 1.15
## searchDFS:datasetmovies:task4.Exploration 0.37 1.19
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 4.00 1.70
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 1.35 1.65
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 3.14 1.71
## l-95% CI u-95% CI Rhat
## Intercept[1] -4.90 -1.56 1.00
## Intercept[2] -3.34 -0.05 1.00
## Intercept[3] -1.90 1.32 1.00
## Intercept[4] 0.58 3.89 1.00
## oracleDziban -2.82 1.20 1.00
## searchDFS -4.20 0.05 1.00
## datasetmovies -2.86 1.32 1.00
## task2.RetrieveValue -1.22 1.24 1.00
## task3.Prediction -0.78 1.64 1.00
## task4.Exploration -0.07 2.53 1.00
## participant_groupstudent -0.15 1.69 1.00
## oracleDziban:searchDFS -0.91 4.98 1.00
## oracleDziban:datasetmovies -1.58 4.23 1.00
## searchDFS:datasetmovies -0.88 5.26 1.00
## oracleDziban:task2.RetrieveValue -1.21 2.29 1.00
## oracleDziban:task3.Prediction -0.14 3.40 1.00
## oracleDziban:task4.Exploration -0.68 2.96 1.00
## searchDFS:task2.RetrieveValue -0.80 2.59 1.00
## searchDFS:task3.Prediction -1.19 2.15 1.00
## searchDFS:task4.Exploration -2.52 0.97 1.00
## datasetmovies:task2.RetrieveValue -1.32 1.97 1.00
## datasetmovies:task3.Prediction -2.00 1.25 1.00
## datasetmovies:task4.Exploration -2.28 1.08 1.00
## oracleDziban:searchDFS:datasetmovies -6.34 2.02 1.00
## oracleDziban:searchDFS:task2.RetrieveValue -5.03 -0.27 1.00
## oracleDziban:searchDFS:task3.Prediction -4.19 0.55 1.00
## oracleDziban:searchDFS:task4.Exploration -3.54 1.29 1.00
## oracleDziban:datasetmovies:task2.RetrieveValue -3.74 1.09 1.00
## oracleDziban:datasetmovies:task3.Prediction -3.76 0.97 1.00
## oracleDziban:datasetmovies:task4.Exploration -4.30 0.49 1.00
## searchDFS:datasetmovies:task2.RetrieveValue -3.86 0.76 1.00
## searchDFS:datasetmovies:task3.Prediction -2.81 1.71 1.00
## searchDFS:datasetmovies:task4.Exploration -1.96 2.67 1.00
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 0.73 7.29 1.00
## oracleDziban:searchDFS:datasetmovies:task3.Prediction -2.05 4.55 1.00
## oracleDziban:searchDFS:datasetmovies:task4.Exploration -0.26 6.49 1.00
## Bulk_ESS Tail_ESS
## Intercept[1] 591 1120
## Intercept[2] 574 857
## Intercept[3] 570 881
## Intercept[4] 613 1038
## oracleDziban 552 1025
## searchDFS 554 1169
## datasetmovies 495 565
## task2.RetrieveValue 680 1386
## task3.Prediction 647 1328
## task4.Exploration 624 1503
## participant_groupstudent 579 1391
## oracleDziban:searchDFS 541 1016
## oracleDziban:datasetmovies 492 780
## searchDFS:datasetmovies 417 764
## oracleDziban:task2.RetrieveValue 675 1706
## oracleDziban:task3.Prediction 671 1314
## oracleDziban:task4.Exploration 618 1484
## searchDFS:task2.RetrieveValue 741 1375
## searchDFS:task3.Prediction 646 1553
## searchDFS:task4.Exploration 683 1595
## datasetmovies:task2.RetrieveValue 694 1446
## datasetmovies:task3.Prediction 724 1301
## datasetmovies:task4.Exploration 701 1622
## oracleDziban:searchDFS:datasetmovies 464 793
## oracleDziban:searchDFS:task2.RetrieveValue 696 1663
## oracleDziban:searchDFS:task3.Prediction 694 1221
## oracleDziban:searchDFS:task4.Exploration 675 1488
## oracleDziban:datasetmovies:task2.RetrieveValue 780 1508
## oracleDziban:datasetmovies:task3.Prediction 774 1457
## oracleDziban:datasetmovies:task4.Exploration 713 1425
## searchDFS:datasetmovies:task2.RetrieveValue 829 1546
## searchDFS:datasetmovies:task3.Prediction 736 1496
## searchDFS:datasetmovies:task4.Exploration 745 1485
## oracleDziban:searchDFS:datasetmovies:task2.RetrieveValue 857 1773
## oracleDziban:searchDFS:datasetmovies:task3.Prediction 820 1439
## oracleDziban:searchDFS:datasetmovies:task4.Exploration 811 1533
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc 1.00 0.00 1.00 1.00 1.00 3000 3000
##
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Trace plots help us check whether there is evidence of non-convergence for our model.
# plot(models$overall)
In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).
# Pairs plots for the overall model, used to spot strongly correlated
# parameters. `fixed = TRUE` makes brms match `pars` exactly (case-sensitive)
# instead of treating them as regular expressions.

# Thresholds (cut points) of the cumulative model.
pairs(
  models$overall,
  pars = c(
    "b_Intercept[1]",
    "b_Intercept[2]",
    "b_Intercept[3]",
    "b_Intercept[4]"
  ),
  fixed = TRUE
)

# Main effects. The names have to match the coefficients reported by
# summary(): the factor levels are "Dziban" and "DFS", so the lowercase
# "b_oracledziban" / "b_searchdfs" would not match any parameter.
pairs(
  models$overall,
  pars = c(
    "b_datasetmovies",
    "b_oracleDziban",
    "b_searchDFS",
    "b_task2.RetrieveValue",
    "b_task3.Prediction",
    "b_task4.Exploration"
  ),
  fixed = TRUE
)
We now look at a response for the overall rating using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 50% credible interval, while the thinner, longer line represents the 95% credible interval.
# Summarise posterior draws of the overall rating per search/oracle
# combination; the returned object carries both the plot and its interval table.
overall_plot <- user_response_posterior_draws_plot(
  user_response_data,
  models$overall,
  NULL,
  "Oracle/Search Combination",
  "Rating"
)
overall_plot$plot
We can get the numeric values of the interval boundaries shown above with mean_qi
overall_plot$intervals
## # A tibble: 8 x 8
## # Groups: search [2]
## search oracle rating .lower .upper .width .point .interval
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS CompassQL 0.682 0.203 1.11 0.95 mean qi
## 2 BFS Dziban 0.711 0.250 1.13 0.95 mean qi
## 3 DFS CompassQL 0.178 -0.368 0.662 0.95 mean qi
## 4 DFS Dziban 0.539 0.0469 0.984 0.95 mean qi
## 5 BFS CompassQL 0.682 0.531 0.844 0.5 mean qi
## 6 BFS Dziban 0.711 0.559 0.868 0.5 mean qi
## 7 DFS CompassQL 0.178 0 0.368 0.5 mean qi
## 8 DFS Dziban 0.539 0.391 0.703 0.5 mean qi
## Saving 7 x 5 in image
Next, we want to see if there is any meaningful difference in the overall rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).
# Draw posterior predictions for every observed row; re_formula = NA
# leaves group-level effects out of the predictions.
overall_predictive_data <- add_predicted_draws(
  user_response_data,
  models$overall,
  seed = seed,
  re_formula = NA
)
# Label each row with its "<search> <oracle>" combination.
overall_predictive_data$alg <- paste(
  overall_predictive_data$search,
  overall_predictive_data$oracle
)
Differences in user score by search algorithm.
# Difference in predicted "overall" rating between search algorithms.
search_differences$overall <- user_response_diff_plot(
  overall_predictive_data,
  "search",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$overall$plot
Differences in user score by oracle.
# For every posterior draw, average the predicted ratings (converted with
# as.numeric()) within each oracle, then take the between-oracle
# difference of those per-draw means.
oracle_differences$overall <- overall_predictive_data %>%
  group_by(oracle, .draw) %>%
  summarise(rating = weighted.mean(x = as.numeric(.prediction))) %>%
  compare_levels(variable = rating, by = oracle) %>%
  rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
# Tag the per-draw oracle differences with the metric they belong to.
oracle_differences$overall$metric <- "overall"
# Half-eye density of the difference in rating with 95% and 50% intervals;
# the dashed vertical line at zero marks "no difference". The x-axis label
# embeds the comparison name taken from the first row.
ggplot(oracle_differences$overall, aes(x = diff_in_rating, y = "overall")) +
  stat_halfeye(.width = c(.95, .5)) +
  geom_vline(xintercept = 0, linetype = "longdash") +
  labs(
    x = paste0(
      "Expected Difference in Rating (",
      oracle_differences$overall[1, 'oracle'],
      ")"
    ),
    y = "Condition"
  ) +
  theme_minimal()
# Difference in predicted "overall" rating between oracles. This replaces
# whatever was previously stored in oracle_differences$overall.
oracle_differences$overall <- user_response_diff_plot(
  overall_predictive_data,
  "oracle",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$overall$plot
Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)
# Keep only the two search/oracle combinations being contrasted.
overall_predictive_data_subset <- subset(
  overall_predictive_data,
  alg %in% c("DFS CompassQL", "BFS Dziban")
)
# Difference in predicted "overall" rating between those two combinations.
alg_differences$overall <- user_response_diff_plot(
  overall_predictive_data_subset,
  "alg",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$overall$plot
Differences in user score by participant group
# Difference in predicted "overall" rating between participant groups.
participant_group_differences$overall <- user_response_diff_plot(
  overall_predictive_data,
  "participant_group",
  "overall",
  "Difference in Overall Rating",
  "Task",
  NULL
)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$overall$plot
Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We’ll start with differences in search algorithms.
# Stack the per-metric `differences` tables for the search comparison into
# one data frame, pass it to the summary helper, and show the plot for the
# confidence metrics.
combined_search_differences <- do.call(
  rbind,
  lapply(
    c("confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"),
    function(metric_key) search_differences[[metric_key]]$differences
  )
)
search_difference_plots_intervals <- user_response_diff_summary(
  combined_search_differences,
  "search"
)
search_difference_plots_intervals$plot_confidence
View intervals
search_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: search [1]
## search metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS - DFS Answer -0.0752 -0.606 0.424 0.95 mean qi
## 2 BFS - DFS Understanding Data 0.0484 -0.485 0.636 0.95 mean qi
## 3 BFS - DFS Answer -0.0752 -0.242 0.0909 0.5 mean qi
## 4 BFS - DFS Understanding Data 0.0484 -0.152 0.242 0.5 mean qi
search_difference_plots_intervals$plot_preference
View intervals
search_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: search [1]
## search metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS - DFS Overall 0.344 -0.273 0.970 0.95 mean qi
## 2 BFS - DFS Utility 0.517 -0.364 1.33 0.95 mean qi
## 3 BFS - DFS Ease of Use 0.588 -0.0303 1.21 0.95 mean qi
## 4 BFS - DFS Efficiency 0.641 -0.121 1.39 0.95 mean qi
## 5 BFS - DFS Overall 0.344 0.121 0.545 0.5 mean qi
## 6 BFS - DFS Utility 0.517 0.242 0.818 0.5 mean qi
## 7 BFS - DFS Ease of Use 0.588 0.364 0.788 0.5 mean qi
## 8 BFS - DFS Efficiency 0.641 0.364 0.909 0.5 mean qi
# Stack the per-metric `differences` tables for the oracle comparison into
# one data frame, pass it to the summary helper, and show the plot for the
# confidence metrics.
combined_oracle_differences <- do.call(
  rbind,
  lapply(
    c("confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"),
    function(metric_key) oracle_differences[[metric_key]]$differences
  )
)
oracle_difference_plots_intervals <- user_response_diff_summary(
  combined_oracle_differences,
  "oracle"
)
oracle_difference_plots_intervals$plot_confidence
View intervals
oracle_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: oracle [1]
## oracle metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Dziban - Comp… Answer 0.144 -0.394 0.727 0.95 mean qi
## 2 Dziban - Comp… Understandin… 0.209 -0.273 0.697 0.95 mean qi
## 3 Dziban - Comp… Answer 0.144 -0.0606 0.333 0.5 mean qi
## 4 Dziban - Comp… Understandin… 0.209 0.0606 0.364 0.5 mean qi
oracle_difference_plots_intervals$plot_preference
View intervals
oracle_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: oracle [1]
## oracle metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 Dziban - Compass… Overall 0.205 -0.515 0.879 0.95 mean qi
## 2 Dziban - Compass… Utility 0.290 -0.515 1.09 0.95 mean qi
## 3 Dziban - Compass… Ease of U… 0.0684 -0.606 0.727 0.95 mean qi
## 4 Dziban - Compass… Efficiency 0.242 -0.545 1.03 0.95 mean qi
## 5 Dziban - Compass… Overall 0.205 -0.0303 0.455 0.5 mean qi
## 6 Dziban - Compass… Utility 0.290 0.0303 0.545 0.5 mean qi
## 7 Dziban - Compass… Ease of U… 0.0684 -0.152 0.303 0.5 mean qi
## 8 Dziban - Compass… Efficiency 0.242 -0.0303 0.515 0.5 mean qi
# Stack the per-metric `differences` tables for the search/oracle
# combination comparison into one data frame, pass it to the summary
# helper, and show the plot for the confidence metrics.
combined_alg_differences <- do.call(
  rbind,
  lapply(
    c("confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"),
    function(metric_key) alg_differences[[metric_key]]$differences
  )
)
alg_difference_plots_intervals <- user_response_diff_summary(
  combined_alg_differences,
  "alg"
)
alg_difference_plots_intervals$plot_confidence
View intervals
alg_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: alg [1]
## alg metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS Dziban - DF… Answer 0.0663 -0.647 0.765 0.95 mean qi
## 2 BFS Dziban - DF… Understandi… 0.250 -0.471 1.06 0.95 mean qi
## 3 BFS Dziban - DF… Answer 0.0663 -0.176 0.294 0.5 mean qi
## 4 BFS Dziban - DF… Understandi… 0.250 0 0.529 0.5 mean qi
alg_difference_plots_intervals$plot_preference
View intervals
alg_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: alg [1]
## alg metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 BFS Dziban - DFS C… Overall 0.533 -0.412 1.47 0.95 mean qi
## 2 BFS Dziban - DFS C… Utility 0.783 -0.294 1.82 0.95 mean qi
## 3 BFS Dziban - DFS C… Ease of … 0.637 -0.294 1.53 0.95 mean qi
## 4 BFS Dziban - DFS C… Efficien… 0.857 -0.235 1.94 0.95 mean qi
## 5 BFS Dziban - DFS C… Overall 0.533 0.235 0.882 0.5 mean qi
## 6 BFS Dziban - DFS C… Utility 0.783 0.412 1.18 0.5 mean qi
## 7 BFS Dziban - DFS C… Ease of … 0.637 0.353 0.941 0.5 mean qi
## 8 BFS Dziban - DFS C… Efficien… 0.857 0.471 1.24 0.5 mean qi
# Stack the per-metric `differences` tables for the participant-group
# comparison into one data frame, pass it to the summary helper, and show
# the plot for the confidence metrics.
combined_participant_group_differences <- do.call(
  rbind,
  lapply(
    c("confidence_udata", "confidence_ans", "efficiency",
      "ease_of_use", "utility", "overall"),
    function(metric_key) participant_group_differences[[metric_key]]$differences
  )
)
participant_group_difference_plots_intervals <- user_response_diff_summary(
  combined_participant_group_differences,
  "participant_group"
)
participant_group_difference_plots_intervals$plot_confidence
View intervals
participant_group_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups: participant_group [1]
## participant_group metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 student - profess… Answer 0.134 -0.275 0.579 0.95 mean qi
## 2 student - profess… Understa… 0.216 -0.183 0.638 0.95 mean qi
## 3 student - profess… Answer 0.134 -0.0115 0.275 0.5 mean qi
## 4 student - profess… Understa… 0.216 0.0769 0.352 0.5 mean qi
participant_group_difference_plots_intervals$plot_preference
View intervals
participant_group_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups: participant_group [1]
## participant_group metric difference .lower .upper .width .point .interval
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 student - professi… Overall 0.413 -0.152 0.987 0.95 mean qi
## 2 student - professi… Utility 0.255 -0.412 0.913 0.95 mean qi
## 3 student - professi… Ease of… 0.344 -0.198 0.902 0.95 mean qi
## 4 student - professi… Efficie… 0.240 -0.412 0.896 0.95 mean qi
## 5 student - professi… Overall 0.413 0.217 0.606 0.5 mean qi
## 6 student - professi… Utility 0.255 0.0264 0.487 0.5 mean qi
## 7 student - professi… Ease of… 0.344 0.156 0.531 0.5 mean qi
## 8 student - professi… Efficie… 0.240 0.0115 0.467 0.5 mean qi